I have approached this case study in the following way.
The data set has 18 independent variables (all continuous) and 1 target variable, which takes 3 values: Car, Bus and Van.
#Import all the necessary modules
import pandas as pd
import numpy as np
import os
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
from scipy.stats import zscore
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from plotly.subplots import make_subplots
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
# Read the vehicle data set
veh = pd.read_csv("C:/Users/tt002/Downloads/vehicle.csv")
veh.head()
veh.info()        # data types and total non-null entries per column
veh.describe()    # descriptive statistics of the data set
veh.isna().sum()  # count of missing values in each column

# Separate predictors and target, then fill missing values with the
# median of each column (all 18 predictors are continuous).
X = veh.drop('class', axis=1)  # predictor feature columns (m x 18)
Y = veh['class']               # target class (van, bus, car), length m

# sklearn's Imputer was removed in 0.22; SimpleImputer is the drop-in
# replacement.  pandas reads missing entries as np.nan (not the string
# 'NaN'), and SimpleImputer always imputes column-wise, which is what the
# old `axis=0` requested.
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='median')
imp_x = pd.DataFrame(imp.fit_transform(X), columns=list(X))
imp_x.isna().sum()  # verify that every missing value has been imputed
# Distribution of each of the 18 independent variables on a 3x6 grid.
# NOTE: the original loop body carried no indentation (a SyntaxError as
# pasted); iterating columns zipped with the flattened axes array fixes
# that and removes the manual counter.
fig, ax = plt.subplots(3, 6, sharex='col', sharey='row', figsize=(20, 10))
for col, axis in zip(imp_x.columns, ax.ravel()):
    imp_x.hist(column=col, ax=axis)

# Pair plot with all the variables (seaborn creates its own figure)
plt.figure(figsize=(50, 50))
sns.pairplot(veh)
# Split the imputed predictors and the target into 70% train / 30% test;
# the fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(imp_x, Y, test_size=0.3, random_state = 2)  # splitting the data into training and test sets
# Fit an SVM classifier (default RBF kernel) on the training data.
clf = svm.SVC(gamma=0.025, C=3)
mod = clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)  # predict with the fitted SVM on the test data
print("The model accuracy in training set:{0:.4f}".format(mod.score(x_train, y_train)))
print("Model Accuracy in Test set: {0:.4f}".format(metrics.accuracy_score(y_test, y_pred)))
# Refit with a polynomial kernel instead of the default RBF kernel.
# NOTE: `mod` and `y_pred` are rebound here to the polynomial model's
# results; later cells that reference them get the polynomial model.
clf_poly = svm.SVC(gamma=0.025, C=3, kernel = 'poly')
mod = clf_poly.fit(x_train, y_train)
y_pred = clf_poly.predict(x_test)
print("The model accuracy in training set:{0:.4f}".format(mod.score(x_train, y_train)))
print("Model Accuracy in Test set: {0:.4f}".format(metrics.accuracy_score(y_test, y_pred)))
# K-fold cross validation of the polynomial SVM on the full imputed data.
# shuffle=True is required whenever random_state is set: scikit-learn
# (>= 0.24) raises "Setting a random_state has no effect since shuffle is
# False" for KFold(random_state=...) without it.
kfold = KFold(n_splits=10, shuffle=True, random_state=2)
model = svm.SVC(gamma=0.025, C=3, kernel='poly')
results = cross_val_score(model, imp_x, Y, cv=kfold)  # one accuracy per fold
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
# Box plot of the raw features to see how differently they are scaled.
plt.figure(figsize=(25,8))
sns.boxplot(data = imp_x,orient = 'h') # Box plot to check how different variables are spread across
# Standardize every column to z-scores (mean 0, std 1) so that PCA is not
# dominated by the features with the largest raw magnitudes.
df =imp_x.apply(zscore) # replacing the original values with z scores to normalize the data
df.head()
plt.figure(figsize=(25,8))
sns.boxplot(data = df,orient = 'h') # To check how the data is normalized post transformation
# PCA done "by hand" on the standardized data.
# Step 1 - covariance matrix of the z-scored features (features x features)
cov_matrix = np.cov(df.T)
# Use %-interpolation: the original passed the matrix as a second argument
# to print(), so a literal "%s" was printed instead of the values.
print('Covariance Matrix \n%s' % cov_matrix)

# Step 2 - eigenvalues and eigenvectors.  The covariance matrix is
# symmetric, so eigh is the right routine: it is numerically stable and
# guarantees real eigenvalues (eig may return complex values with tiny
# imaginary parts).  eigh returns ascending order, so flip to descending.
eig_vals, eig_vecs = np.linalg.eigh(cov_matrix)
eig_vals = eig_vals[::-1]
eig_vecs = eig_vecs[:, ::-1]
print('Eigen Vectors \n%s' % eig_vecs)
print('\n Eigen Values \n%s' % eig_vals)

# Percentage of total variance explained by each principal component,
# sorted descending, and its cumulative sum.
tot = sum(eig_vals)
var_exp = [(i / tot) * 100 for i in sorted(eig_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)
print("Cumulative Variance Explained", cum_var_exp)

# Scree plot: eigenvalue share per component, in descending order.
plt.plot(var_exp)
plt.title('SCREE PLOT')
plt.xlabel('Principal Component number')
plt.ylabel('Eigen value')

# Bar (individual) + step (cumulative) explained-variance plot.
plt.figure(figsize=(10, 5))
plt.bar(range(1, eig_vals.size + 1), var_exp, alpha=0.5, align='center', label='Individual explained variance')
plt.step(range(1, eig_vals.size + 1), cum_var_exp, where='mid', label='Cumulative explained variance')
plt.ylabel('Explained Variance Ratio')
plt.xlabel('Principal Components')
plt.legend(loc='best')
plt.tight_layout()
plt.show()
# Reduce dimensionality from 18 features to the top 8 principal components
# (the original comment said 7, but n_components is 8).
pca8 = PCA(n_components=8)
pca8.fit(df)
print(pca8.components_)
print(pca8.explained_variance_ratio_)
# NOTE: `pca8` is rebound here from the fitted PCA model to the transformed
# (n_samples x 8) ndarray; the model object is no longer reachable after
# this line.
pca8 = pca8.transform(df)
df_pca = pd.DataFrame(pca8) # top 8 principal components as a DataFrame
df_pca.head()
sns.pairplot(df_pca)
# Split the principal-component data into training and test sets, using the
# same test_size and random_state as before so the row split matches the
# earlier one on the original features.
x_train_pca, x_test_pca, y_train_pca, y_test_pca = train_test_split(pca8, Y, test_size=0.3, random_state = 2)
clf_pca = svm.SVC(gamma=0.028, C=3, kernel = 'poly') # polynomial kernel first, since it gave excellent accuracy on the original data
mod_pca = clf_pca.fit(x_train_pca, y_train_pca)
print("The model accuracy in training set:{0:.4f}".format(mod_pca.score(x_train_pca, y_train_pca)))
# Re-split from the DataFrame (same values as the `pca8` array) and refit
# with an RBF kernel.  NOTE: clf_pca / mod_pca and the *_pca split
# variables are rebound here; later cells see the RBF model.
x_train_pca, x_test_pca, y_train_pca, y_test_pca = train_test_split(df_pca, Y, test_size=0.3, random_state = 2)
clf_pca = svm.SVC(gamma=0.028, C=3, kernel = 'rbf')
mod_pca = clf_pca.fit(x_train_pca, y_train_pca)
print("The model accuracy in training set:{0:.4f}".format(mod_pca.score(x_train_pca, y_train_pca)))
# K-fold cross validation of an RBF SVM on the 8-component PCA data.
# shuffle=True is required whenever random_state is set: scikit-learn
# (>= 0.24) raises a ValueError for KFold(random_state=...) with the
# default shuffle=False.
kfold = KFold(n_splits=10, shuffle=True, random_state=2)
model = svm.SVC(gamma=0.025, C=3, kernel='rbf')
results = cross_val_score(model, df_pca, Y, cv=kfold)  # one accuracy per fold
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
# Evaluate the polynomial-kernel SVM (clf_poly) on the original test set.
y_pred = clf_poly.predict(x_test)
print("The model accuracy in training set:{0:.4f}".format(mod.score(x_train, y_train)))
print("Model Accuracy in Test set: {0:.4f}".format(metrics.accuracy_score(y_test, y_pred)))
print(classification_report(y_test, y_pred))
# Confusion matrix as a labelled DataFrame, rendered as a heatmap.
# A single labels list replaces the duplicated identity comprehensions
# ([i for i in [...]]) of the original.
class_labels = ["bus", "car", "van"]
cmsvm = metrics.confusion_matrix(y_test, y_pred, labels=class_labels)
df_svm = pd.DataFrame(cmsvm, index=class_labels,
                      columns=["Predict " + c for c in class_labels])
ax1 = sns.heatmap(df_svm, cmap="Accent", annot=True, fmt='')
ax1.title.set_text('Support Vector Machine - Polynomial Kernel')
# Evaluate the PCA-space SVM (clf_pca, RBF kernel) on the PCA test set.
y_pred_pca = clf_pca.predict(x_test_pca)
print("Model Accuracy in Test set: {0:.4f}".format(metrics.accuracy_score(y_test_pca, y_pred_pca)))
print(classification_report(y_test_pca, y_pred_pca))
# Confusion matrix as a labelled DataFrame, rendered as a heatmap.
# A single labels list replaces the duplicated identity comprehensions
# ([i for i in [...]]) of the original.
labels_pca = ["bus", "car", "van"]
cmsvm_pca = metrics.confusion_matrix(y_test_pca, y_pred_pca, labels=labels_pca)
df_svm_pca = pd.DataFrame(cmsvm_pca, index=labels_pca,
                          columns=["Predict " + c for c in labels_pca])
ax1 = sns.heatmap(df_svm_pca, cmap="Accent", annot=True, fmt='')
ax1.title.set_text('Support Vector Machine - PCA - 8 components')